<?php
/**
 * {project-name}
 *
 * @author {author-name}
 */
declare(strict_types=1);

namespace App\Job;

use PHPHtmlParser\Dom;
use Spiral\Jobs\JobHandler;
use Spiral\Prototype\Traits\PrototypeTrait;

/**
 * Queue job that downloads a single page, stores it under the runtime
 * directory and enqueues follow-up jobs for the same-host links it contains.
 *
 * `$this->files` and `$this->queue` are not declared in this class; they are
 * presumably resolved through PrototypeTrait (confirm against the application
 * bootstrap).
 */
class UrlScrapeJob extends JobHandler
{
    use PrototypeTrait;

    /**
     * Scrape `$url` and, while depth remains, enqueue its same-host links.
     *
     * @param int    $depth Remaining crawl depth; links are only followed when it is greater than 1.
     * @param string $url   Absolute URL of the page to download.
     *
     * @throws \RuntimeException When the page cannot be downloaded.
     */
    public function invoke(int $depth, string $url): void
    {
        if ($this->files->exists($this->localPath($url))) {
            // Skip pages that have already been scraped.
            return;
        }

        $body = file_get_contents($url);
        if ($body === false) {
            // Fail the job with a clear message instead of passing `false` into store().
            throw new \RuntimeException(sprintf('Unable to download "%s"', $url));
        }

        $localFile = $this->store($url, $body);

        if ($depth <= 1) {
            // Depth exhausted: nothing would be enqueued, so skip parsing the page.
            return;
        }

        $dom = new Dom();
        $dom->loadFromFile($localFile);

        // URLs already handled for this page; seeded with the page itself so self-links are skipped.
        $seen = [$url => true];

        foreach ($dom->find('a') as $a) {
            $next = $this->nextURL($url, $a->href);

            if ($next === null || isset($seen[$next])) {
                continue;
            }

            $seen[$next] = true;
            $this->queue->push(self::class, ['depth' => $depth - 1, 'url' => $next]);
        }
    }

    /**
     * Write the page body to its cache file and append a line to the scrape log.
     *
     * @param string $url  URL the body was downloaded from.
     * @param string $body Raw response body.
     *
     * @return string Path of the file the body was written to.
     */
    private function store(string $url, string $body): string
    {
        $localFile = $this->localPath($url);
        $this->files->write($localFile, $body);

        $this->files->append(
            directory('runtime') . 'scrape.log',
            sprintf("%s,%s,%s\n", date('c'), md5($url), $url)
        );

        return $localFile;
    }

    /**
     * Build the cache file path for a URL: runtime directory + md5 of the URL + ".html".
     */
    private function localPath(string $url): string
    {
        return directory('runtime') . md5($url) . '.html';
    }

    /**
     * Turn an anchor's href into an absolute URL on the same host as `$base`.
     *
     * @param string      $base   Absolute URL of the page the link was found on.
     * @param string|null $target Raw href value; null when the anchor has none.
     *
     * @return string|null Absolute URL to scrape next, or null when the link
     *                     must not be followed (empty, malformed, other host,
     *                     non-http(s) scheme, fragment-only).
     */
    private function nextURL(string $base, ?string $target): ?string
    {
        if ($target === null) {
            return null;
        }

        $target = trim($target);
        if ($target === '') {
            return null;
        }

        $baseParts = parse_url($base);
        $targetParts = parse_url($target);

        // parse_url() returns false for seriously malformed URLs.
        if (!is_array($baseParts) || !is_array($targetParts)) {
            return null;
        }

        if (!isset($baseParts['scheme'], $baseParts['host'])) {
            // Without an absolute base there is nothing to compare or resolve against.
            return null;
        }

        if (isset($targetParts['host'])
            && strcasecmp($targetParts['host'], $baseParts['host']) !== 0
        ) {
            // Only follow links on the same host as the starting page.
            return null;
        }

        if (isset($targetParts['scheme'])) {
            if (!isset($targetParts['host'])) {
                // mailto:, javascript:, tel: and similar are not pages.
                return null;
            }

            // hrefs are untrusted input: never hand other stream wrappers to file_get_contents().
            if (!in_array(strtolower($targetParts['scheme']), ['http', 'https'], true)) {
                return null;
            }

            // Complete links that carry both scheme and host are returned as-is.
            return $target;
        }

        if (isset($targetParts['host'])) {
            // Protocol-relative link ("//host/path"): borrow the base scheme.
            return $baseParts['scheme'] . ':' . $target;
        }

        // From here on the link is relative and is resolved against the base URL.
        $basePath = $baseParts['path'] ?? '/';
        $path = $targetParts['path'] ?? '';

        if ($path === '') {
            if (!isset($targetParts['query'])) {
                // Fragment-only links point back at the current page.
                return null;
            }

            // Query-only link ("?page=2") applies to the current path.
            $path = $basePath;
        } elseif ($path[0] !== '/') {
            // Document-relative path: resolve against the directory of the base path.
            // Dot segments ("./", "../") are left for the server to interpret.
            $lastSlash = strrpos($basePath, '/');
            $baseDir = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
            $path = $baseDir . $path;
        }

        $authority = $baseParts['host'];
        if (isset($baseParts['port'])) {
            $authority .= ':' . $baseParts['port'];
        }

        $next = sprintf('%s://%s%s', $baseParts['scheme'], $authority, $path);
        if (isset($targetParts['query'])) {
            $next .= '?' . $targetParts['query'];
        }

        return $next;
    }
}
